#'
#'  Project title:     'Sovereign Risk and Government Change: Elections, Ideology and Experience'
#'  Authors:           Sarah M. Brooks; Raphael Cunha; Layna Mosley
#'  File description:  Creates functions for estimating heteroskedastic regression models and for
#'                     testing for autocorrelation in panel data with fixed effects
#'  Output:            hetreg() and woolARtest() functions
#'

required_packages <- c("tidyverse", "gamlss", "lmtest", "plm")
new_packages <- required_packages[!(required_packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)

library(tidyverse)
library(gamlss)
library(lmtest)
library(plm)


# Heteroskedastic regression estimation and inference ---------------------

# Define function for estimating heteroskedastic regression
# model with fixed effects and clustered standard errors

# y.var = outcome variable
# x.var = right-hand side variables
# x.int = right-hand side variables that enter as multiplicative interaction
# x.het = right-hand side variables in conditional variance equation
# fe.var = unit variable for fixed effects
# time.var = time variable for fixed effects

hetreg <- function(y.var, x.var, x.int = NULL, x.het, fe.var, time.var = NULL, data, clusterSE = TRUE){
  
  if(is.null(x.int)){
    
    if(is.null(time.var)){
      
      fmla.subset <- paste(paste(paste("!is.na(", y.var, ")", sep = ""), collapse = " & "),
                           paste(paste("!is.na(", x.var, ")", sep = ""), collapse = " & "),
                           paste(paste("!is.na(", x.het, ")", sep = ""), collapse = " & "),
                           paste(paste("!is.na(", fe.var, ")", sep = ""), collapse = " & "),
                           sep = " & ")
      
      gamlss_dat <- data %>%
        filter_(fmla.subset) %>%
        dplyr::select(one_of(c(y.var, x.var, x.het, fe.var, "year", "month_year"))) %>%
        as.data.frame()
    }
    
    if(!is.null(time.var)){
      
      fmla.subset <- paste(paste(paste("!is.na(", y.var, ")", sep = ""), collapse = " & "),
                           paste(paste("!is.na(", x.var, ")", sep = ""), collapse = " & "),
                           paste(paste("!is.na(", x.het, ")", sep = ""), collapse = " & "),
                           paste(paste("!is.na(", fe.var, ")", sep = ""), collapse = " & "),
                           paste(paste("!is.na(", time.var, ")", sep = ""), collapse = " & "),
                           sep = " & ")
      
      gamlss_dat <- data %>%
        filter_(fmla.subset) %>%
        dplyr::select(one_of(c(y.var, x.var, x.het, fe.var, time.var, "year", "month_year"))) %>%
        as.data.frame()
    }
  }
  
  if(!is.null(x.int)){
    
    if(is.null(time.var)){
      
      fmla.subset <- paste(paste(paste("!is.na(", y.var, ")", sep = ""), collapse = " & "),
                           paste(paste("!is.na(", x.var, ")", sep = ""), collapse = " & "),
                           paste(paste("!is.na(", x.int, ")", sep = ""), collapse = " & "),
                           paste(paste("!is.na(", x.het, ")", sep = ""), collapse = " & "),
                           paste(paste("!is.na(", fe.var, ")", sep = ""), collapse = " & "),
                           sep = " & ")
      
      gamlss_dat <- data %>%
        filter_(fmla.subset) %>%
        dplyr::select(one_of(c(y.var, x.var, x.int, x.het, fe.var, "year", "month_year"))) %>%
        as.data.frame()
    }
    
    if(!is.null(time.var)){
      
      fmla.subset <- paste(paste(paste("!is.na(", y.var, ")", sep = ""), collapse = " & "),
                           paste(paste("!is.na(", x.var, ")", sep = ""), collapse = " & "),
                           paste(paste("!is.na(", x.int, ")", sep = ""), collapse = " & "),
                           paste(paste("!is.na(", x.het, ")", sep = ""), collapse = " & "),
                           paste(paste("!is.na(", fe.var, ")", sep = ""), collapse = " & "),
                           paste(paste("!is.na(", time.var, ")", sep = ""), collapse = " & "),
                           sep = " & ")
      
      gamlss_dat <- data %>%
        filter_(fmla.subset) %>%
        dplyr::select(one_of(c(y.var, x.var, x.int, x.het, fe.var, time.var, "year", "month_year"))) %>%
        as.data.frame()
    }
  }
  
  assign("gamlss_dat", gamlss_dat, envir = .GlobalEnv)
  
  # Create formulas for estimation
  # Mean and volatility (sigma) equations
  
  if(!is.null(x.int)){
    
    if(!is.null(time.var)){
      
      fmla.mu <- as.formula(paste(y.var, "~ ",
                                  paste(combn(x.int, 1), collapse = "+"), "+",
                                  paste(apply(combn(x.int, 2), 2, function(x) paste(x, collapse = "*")), collapse = " + "), "+",
                                  paste(x.int, collapse = "*"), "+",
                                  paste(x.var, collapse = "+"), "+",
                                  paste("factor(", fe.var, ")", sep = ""), "+",
                                  paste("factor(", time.var, ")", sep = "")))
      
      fmla.sigma <- as.formula(paste("~ ",
                                     paste(combn(x.int, 1), collapse = "+"), "+",
                                     paste(apply(combn(x.int,2), 2, function(x) paste(x, collapse = "*")), collapse = " + "), "+",
                                     paste(x.int, collapse = "*"), "+",
                                     paste(x.het, collapse = "+")))
    }
    
    if(is.null(time.var)){
      
      fmla.mu <- as.formula(paste(y.var, "~ ",
                                  paste(combn(x.int, 1), collapse = "+"), "+",
                                  paste(apply(combn(x.int, 2), 2, function(x) paste(x, collapse = "*")), collapse = " + "), "+",
                                  paste(x.int, collapse = "*"), "+",
                                  paste(x.var, collapse = "+"), "+",
                                  paste("factor(", fe.var, ")", sep = "")))
      
      fmla.sigma <- as.formula(paste("~ ",
                                     paste(combn(x.int, 1), collapse = "+"), "+",
                                     paste(apply(combn(x.int,2), 2, function(x) paste(x, collapse = "*")), collapse = " + "), "+",
                                     paste(x.int, collapse = "*"), "+",
                                     paste(x.het, collapse = "+")))
      }
    }
  
  if(is.null(x.int)){
    
    if(!is.null(time.var)){
      
      fmla.mu <- as.formula(paste(y.var, "~ ",
                                  paste(x.var, collapse = "+"), "+",
                                  paste("factor(", fe.var, ")", sep = ""), "+",
                                  paste("factor(", time.var, ")", sep = "")))
      
      fmla.sigma <- as.formula(paste("~ ",
                                     paste(x.het, collapse = "+")))
    }
    
    if(is.null(time.var)){
      
      fmla.mu <- as.formula(paste(y.var, "~ ",
                                  paste(x.var, collapse = "+"), "+",
                                  paste("factor(", fe.var, ")", sep = "")))
      
      fmla.sigma <- as.formula(paste("~ ",
                                     paste(x.het, collapse = "+")))
    }
  }
  
  # Estimate GAMLSS model with mean and variance equations
  
  mod <- gamlss(formula = fmla.mu,
                sigma.formula = fmla.sigma,
                data = gamlss_dat,
                family = NO(mu.link = "identity",
                            sigma.link = "log")) # Normal distribution
                
  # Compute clustered standard errors for GAMLSS model
  
  if(clusterSE == TRUE){
    
    coef.mu <- data.frame(coefficients = c(coef(mod, what = "mu")))
    rownames(coef.mu) <- paste("mu_", rownames(coef.mu), sep = "")
    
    coef.sigma <- data.frame(coefficients = c(coef(mod, what = "sigma")))
    rownames(coef.sigma) <- paste("sigma_", rownames(coef.sigma), sep = "")
    
    coef.mod <- rbind(coef.mu, coef.sigma)
    
    # Bread for sandwich estimator (var-cov matrix)
    bread <- vcov(mod)
    
    # Meat for robust sandwich estimator
    est.fun <- get.K(mod, what = "Deriv")
    meat <- t(est.fun) %*% est.fun
    sandwich <- bread %*% meat %*% bread
    
    # Meat for clustered sandwich estimator
    
    # Identify clusters
    fc <- as.character(gamlss_dat[, fe.var])
    m <- length(unique(fc))
    k <- length(coef(coef.mod))
    
    # Sum the u's by cluster
    u <- get.K(mod, what = "Deriv")
    u.clust <- matrix(NA, nrow = m, ncol = k)
    
    for(j in 1:k){
      
      u.clust[,j] <- tapply(u[,j], fc, sum)
      
    }
    
    # Make cluster-robust matrix:
    cl.vcov <- bread %*% ((m / (m-1)) * t(u.clust) %*% (u.clust)) %*% bread
    
    # Test coefficients
    
    # First, add modified variable names to vcov matrix to identify mu and sigma equations
    rownames(bread) <- rownames(coef.mod)
    colnames(bread) <- rownames(coef.mod)
    
    rownames(sandwich) <- rownames(coef.mod)
    colnames(sandwich) <- rownames(coef.mod)
    
    rownames(cl.vcov) <- rownames(coef.mod)
    colnames(cl.vcov) <- rownames(coef.mod)
    
    # Exclude fixed effects estimates from vcov and coefficients
    bread <- bread[-grep("factor", rownames(bread)), -grep("factor", colnames(bread))]
    sandwich <- sandwich[-grep("factor", rownames(sandwich)), -grep("factor", colnames(sandwich))]
    cl.vcov <- cl.vcov[-grep("factor", rownames(cl.vcov)), -grep("factor", colnames(cl.vcov))]
    coef.mod <- subset(coef.mod, !grepl("factor", rownames(coef.mod)))
    
    # Test coefficients using cluster-robust vcov
    cl.SE.table <- coeftest(coef.mod, vcov. = cl.vcov)
    
    # Get years and countries covered
    years <- c(min(gamlss_dat$year), max(gamlss_dat$year))
    country_names <- unique(gamlss_dat$country)
    
    # Return results
    
    return(list(cl.SE.table = cl.SE.table,
                nobs = mod$N,
                countries = m,
                country_names = country_names,
                years = years,
                aic = round(mod$aic, digits = 2),
                sbc = round(mod$sbc, digits = 2),
                betas = coef.mod,
                cl.vcov = cl.vcov,
                sandwich = sandwich,
                bread = bread,
                mod = mod,
                data = gamlss_dat))
    
    rm(gamlss_dat, envir = .GlobalEnv)
  }
  
  else{
    return(mod)
    rm(gamlss_dat, envir = .GlobalEnv)
  }
}


# Autocorrelation test for panel data -------------------------------------

#' Wooldridge Test for AR(1) Errors in FE Panel Models:
#' Under the null of no serial correlation in the errors,
#' the residuals of a FE model must be negatively serially correlated,
#' with cor(\hat{u}_{it}, \hat{u}_{is}) = -1/(T-1) for each
#' t,s. He suggests basing a test for this null hypothesis on a
#' pooled regression of FE residuals on their first lag:
#' \hat{u}_{i,t} = \alpha + \delta \hat{u}_{i,t-1} +
#' \eta_{i,t}. Rejecting the restriction \delta = -1/(T-1)
#' makes us conclude against the original null of no serial
#' correlation.

#' The test estimates a FE model and retrieves residuals,
#' then estimates an AR(1) pooling model on them. The test statistic
#' is obtained by applying an F test to the latter model to test the
#' above restriction on delta, setting the covariance matrix to
#' `vcovHC` with the option `method="arellano"` to control for serial
#' correlation.

#' This test does not rely on large--T asymptotics and has therefore
#' good properties in short panels. It is also robust to general
#' heteroskedasticity.

woolARtest <- function (model_residuals, model_data, id_var, time_var) {
  
  FEres <- model_residuals
  data <- model_data
  
  # add residuals and lagged residuals to data frame
  data$FEres <- FEres
  data$FEres.1 <- dplyr::lag(data$FEres)
  
  # format as panel data
  data <- pdata.frame(data, index = c(id_var, time_var))
  data <- na.omit(data)
  data <- data %>%
    dplyr::filter_all(all_vars(!is.infinite(.)))
  
  # calculate auxiliary model
  auxmod <- plm(FEres ~ FEres.1, data = data, model = "pooling", index = c(id_var, time_var))
  
  # calculate theoretical rho under H0: no serial corr. in errors
  t <- pdim(data)$nT$T
  rho.H0 <- -1/(t-1)
  myH0 <- paste("FEres.1 = ", as.character(rho.H0), sep="")
  
  # test H0: rho = rho.H0 with HAC
  myvcov <- function(x) vcovHC(x, method = "arellano") # more params may be passed via ellipsis
  
  # calculate F-stat with restriction rho.H0 and robust vcov
  FEARstat <- ((coef(auxmod)["FEres.1"] - rho.H0)/sqrt(myvcov(auxmod)["FEres.1", "FEres.1"]))^2
  names(FEARstat) <- "F"
  df1 <- c("df1" = 1)
  df2 <- c("df2" = df.residual(auxmod))
  pFEARstat <- pf(FEARstat, df1 = df1, df2 = df2, lower.tail = FALSE)
  
  # insert usual htest features
  dname <- paste(deparse(substitute(model_data)))
  RVAL <- list(statistic = FEARstat,
               parameter = c(df1, df2),
               p.value   = pFEARstat,
               method = "Wooldridge's test for serial correlation in FE panels",
               alternative = "serial correlation",
               data.name = dname)
  class(RVAL) <- "htest"
  return(RVAL)
}

